library(Matrix)
library(Matrix.utils)
library(tidyverse)
## ── Attaching core tidyverse packages ─────────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ───────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ tidyr::expand()     masks Matrix::expand()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ tidyr::pack()       masks Matrix::pack()
## ✖ tidyr::unpack()     masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(Seurat)
## Attaching SeuratObject
library(dplyr)
library(knitr)
library(ggsci)
library(RCurl)
## Warning: package 'RCurl' was built under R version 4.2.3
## 
## Attaching package: 'RCurl'
## 
## The following object is masked from 'package:tidyr':
## 
##     complete
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(SingleR)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## 
## Attaching package: 'MatrixGenerics'
## 
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## 
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## 
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## 
## The following objects are masked from 'package:lubridate':
## 
##     second, second<-
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## 
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The following objects are masked from 'package:Matrix':
## 
##     expand, unname
## 
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## 
## The following object is masked from 'package:lubridate':
## 
##     %within%
## 
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## The following object is masked from 'package:purrr':
## 
##     reduce
## 
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## 
## Attaching package: 'Biobase'
## 
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## 
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## 
## Attaching package: 'SummarizedExperiment'
## 
## The following object is masked from 'package:SeuratObject':
## 
##     Assays
## 
## The following object is masked from 'package:Seurat':
## 
##     Assays
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(multtest)
library(clustree)
## Loading required package: ggraph
library(limma)
## 
## Attaching package: 'limma'
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

#Quality control metrices

load("~/work/PI3K/data/SeuratObjects/merged/Human/merged_seurat.RData")
merged_filtered_seurat <- subset(merged_seurat, subset = Sample_Tag != 'NA' & Sample_Tag != 'Undetermined' & Sample_Tag != 'SampleTag01_hs' & Sample_Tag != 'Multiplet')

merged_filtered_seurat$nCount_RNA <- NULL
merged_filtered_seurat$nFeature_RNA <- NULL
merged_filtered_seurat_meta <- merged_filtered_seurat@meta.data
subset_control <- subset(merged_filtered_seurat_meta, Sample_Tag == "Control" | Sample_Tag == "Control_tagged")
subset_alpelisib <- subset(merged_filtered_seurat_meta, Sample_Tag == "Alpelisib")
subset_copanlisib <- subset(merged_filtered_seurat_meta, Sample_Tag == "Copanlisib")
#cell count
subset_control %>% 
    ggplot(aes(x=Xenograft, fill=Xenograft)) + 
    geom_bar() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    ggtitle("NCells from the untreated xenografts")+
    scale_fill_lancet()
The figures are showing different quality metrices for the control samples.

The figures are showing different quality metrices for the control samples.

# Extract identity and sample information from dataframe to determine the number of cells per sample_tag
n_cells_control<- table(subset_control$Sample_Tag)

# View table
kable(n_cells_control,
      col.names = c("Sample_Tag",
                    "nCells"),
      caption = "The table shows the number of cells which were untreated.")
The table shows the number of cells which were untreated.
Sample_Tag nCells
Control 16260
#UMI count
subset_control %>% 
    ggplot(aes(color=Xenograft, x=nUMI, fill=Xenograft)) + 
    geom_density(alpha = 0.2) + 
    scale_x_log10() + 
    ylab("Cell density") +
    geom_vline(xintercept = 500)+
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    ggtitle("nUMi from the control xenografts")+
    scale_fill_lancet()
The figures are showing different quality metrices for the control samples.

The figures are showing different quality metrices for the control samples.

#Gene Count
subset_control %>% 
    ggplot(aes(color=Xenograft, x=nGene, fill=Xenograft)) + 
    geom_density(alpha = 0.2) + 
    scale_x_log10() + 
    geom_vline(xintercept = 300)+
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    ggtitle("nGene from the control xenografts")+
    scale_fill_lancet()
The figures are showing different quality metrices for the control samples.

The figures are showing different quality metrices for the control samples.

#joint filtering effect
subset_control %>%
    ggplot(aes(x=nUMI, y=nGene)) +
    geom_point() +
    scale_colour_gradient(low = "gray90", high = "black") +
    stat_smooth(method=lm) +
    scale_x_log10() +
    scale_y_log10() +
    theme_classic() +
    geom_vline(xintercept = 500) +
    geom_hline(yintercept = 250) +
    facet_wrap(~Sample_Tag)+
    scale_fill_lancet()
## `geom_smooth()` using formula = 'y ~ x'
The figures are showing different quality metrices for the control samples.

The figures are showing different quality metrices for the control samples.

#Mitochondrial count
subset_control %>% 
    ggplot(aes(color=Xenograft, x=percent.mt, fill=Xenograft)) + 
    geom_density(alpha = 0.2) + 
    scale_x_log10() +
    geom_vline(xintercept = 2)+
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    ggtitle("Mitochondrial gene expression in [%] \n from the control xenografts")+
    scale_fill_lancet()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 22 rows containing non-finite values (`stat_density()`).
The figures are showing different quality metrices for the control samples.

The figures are showing different quality metrices for the control samples.

#cell count
subset_copanlisib %>% 
    ggplot(aes(x=Xenograft, fill=Xenograft)) + 
    geom_bar() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    ggtitle("NCells from the xenografts treated with Copanlisib")+
    scale_fill_lancet()
The figures are showing different quality metrices for the copanlisib samples.

The figures are showing different quality metrices for the copanlisib samples.

# Extract identity and sample information from dataframe to determine the number of cells per sample_tag
n_cells_copanlisib<- table(subset_copanlisib$Sample_Tag)

# View table
kable(n_cells_copanlisib,
      col.names = c("Sample_Tag",
                    "nCells"),
      caption = "The table shows the number of cells which were treated with Copanlisib.")
The table shows the number of cells which were treated with Copanlisib.
Sample_Tag nCells
Copanlisib 2591
#UMI count
subset_copanlisib %>% 
    ggplot(aes(color=Xenograft, x=nUMI, fill=Xenograft)) + 
    geom_density(alpha = 0.2) + 
    scale_x_log10() +
    ylab("Cell density") +
    geom_vline(xintercept = 500)+
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    ggtitle("nUMI from the xenografts treated with Copanlisib")+
    scale_fill_lancet()
The figures are showing different quality metrices for the copanlisib samples.

The figures are showing different quality metrices for the copanlisib samples.

#Gene Count
subset_copanlisib %>% 
    ggplot(aes(color=Xenograft, x=nGene, fill=Xenograft)) + 
    geom_density(alpha = 0.2) +
    scale_x_log10() + 
    geom_vline(xintercept = 300)+
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    ggtitle("nGene from the xenografts treated with Copanlisib")+
    scale_fill_lancet()
The figures are showing different quality metrices for the copanlisib samples.

The figures are showing different quality metrices for the copanlisib samples.

#joint filtering effect
subset_copanlisib %>%
    ggplot(aes(x=nUMI, y=nGene)) +
    geom_point() +
    scale_colour_gradient(low = "gray90", high = "black") +
    stat_smooth(method=lm) +
    scale_x_log10() +
    scale_y_log10() +
    geom_vline(xintercept = 500) +
    geom_hline(yintercept = 250) +
    facet_wrap(~Sample_Tag)+
    scale_fill_lancet()
## `geom_smooth()` using formula = 'y ~ x'
The figures are showing different quality metrices for the copanlisib samples.

The figures are showing different quality metrices for the copanlisib samples.

#Mitochondrial count
subset_copanlisib %>% 
    ggplot(aes(color=Xenograft, x=percent.mt, fill=Xenograft)) + 
    geom_density(alpha = 0.2) + 
    scale_x_log10() + 
    geom_vline(xintercept = 2)+
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    ggtitle("Mitochondrial gene expression in [%] from \n the xenografts treated with Copanlisib")+
    scale_fill_lancet()
The figures are showing different quality metrices for the copanlisib samples.

The figures are showing different quality metrices for the copanlisib samples.

#cell count
subset_alpelisib %>% 
    ggplot(aes(x=Xenograft, fill=Xenograft)) + 
    geom_bar() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    ggtitle("NCells from the xenografts treated with Alpelisib")+
    scale_fill_lancet()
The figures are showing different quality metrices for the alpelisib samples.

The figures are showing different quality metrices for the alpelisib samples.

# Extract identity and sample information from dataframe to determine the number of cells per sample_tag
n_cells_alp<- table(subset_alpelisib$Sample_Tag)

# View table
kable(n_cells_alp,
      col.names = c("Sample_Tag",
                    "nCells"),
      caption = "The table shows the number of cells which were treated with Alpelisib.")
The table shows the number of cells which were treated with Alpelisib.
Sample_Tag nCells
Alpelisib 17198
#UMI count
subset_alpelisib %>% 
    ggplot(aes(color=Xenograft, x=nUMI, fill=Xenograft)) + 
    geom_density(alpha = 0.2) + 
    scale_x_log10() +
    ylab("Cell density") +
    geom_vline(xintercept = 500)+
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    ggtitle("nUMI from the xenografts treated with Alpelisib")+
    scale_fill_lancet()
The figures are showing different quality metrices for the alpelisib samples.

The figures are showing different quality metrices for the alpelisib samples.

#Gene Count
subset_alpelisib %>% 
    ggplot(aes(color=Xenograft, x=nGene, fill=Xenograft)) + 
    geom_density(alpha = 0.2) +
    scale_x_log10() + 
    geom_vline(xintercept = 300)+
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    ggtitle("nGene from the xenografts treated with Alpelisib")+
    scale_fill_lancet()
The figures are showing different quality metrices for the alpelisib samples.

The figures are showing different quality metrices for the alpelisib samples.

#joint filtering effect
subset_alpelisib %>%
    ggplot(aes(x=nUMI, y=nGene)) +
    geom_point() +
      scale_colour_gradient(low = "gray90", high = "black") +
    stat_smooth(method=lm) +
    scale_x_log10() +
    scale_y_log10() +
    geom_vline(xintercept = 500) +
    geom_hline(yintercept = 250) +
    facet_wrap(~Sample_Tag)+
    scale_fill_lancet()
## `geom_smooth()` using formula = 'y ~ x'
The figures are showing different quality metrices for the alpelisib samples.

The figures are showing different quality metrices for the alpelisib samples.

#Mitochondrial count
subset_alpelisib %>% 
    ggplot(aes(color=Xenograft, x=percent.mt, fill=Xenograft)) + 
    geom_density(alpha = 0.2) + 
    scale_x_log10() + 
    geom_vline(xintercept = 2)+
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    ggtitle("Mitochondrial gene expression in [%] from \n the xenografts treated with Alpelisib")+
    scale_fill_lancet()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 9 rows containing non-finite values (`stat_density()`).
The figures are showing different quality metrices for the alpelisib samples.

The figures are showing different quality metrices for the alpelisib samples.

merged_filtered_seurat_meta %>% 
    ggplot(aes(x=Xenograft, fill=nUMI)) + 
    geom_bar() +
    labs(x="Xenografts", y="Number of cells",
               title="Overview of the number of cells by xenografts and condition") +
    facet_grid(~Sample_Tag) +
    theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1)) +
    scale_fill_lancet()
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the
##   data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable
##   into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the
##   data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable
##   into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the
##   data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable
##   into a factor?

#Investigation of unwanted variations - cell cycle phases and mitochondrial gene expression

load("~/work/PI3K/data/SeuratObjects/filtered/Human/filtered_seurat.RData")
filtered_seurat$nCount_RNA <- NULL
filtered_seurat$nFeature_RNA <- NULL
filtered_seurat$orig.ident <- NULL
load("~/work/PI3K/data/SuplementaryFiles/SeuratObjects/cycle.rda")
filtered_phase_seurat <- CellCycleScoring(filtered_seurat, 
                                 g2m.features = g2m_genes, 
                                 s.features = s_genes)
## Warning: The following features are not present in the object: RAD51, not
## searching for symbol synonyms
filtered_phase_seurat <- FindVariableFeatures(filtered_phase_seurat,
                                     selection.method = "vst",
                                     nfeatures = 2000,
                                     verbose = FALSE)
filtered_phase_seurat <- ScaleData(filtered_phase_seurat)
## Centering and scaling data matrix
filtered_phase_seurat <- RunPCA(filtered_phase_seurat)
## PC_ 1 
## Positive:  RETNLB, IL10, CPA3, SPRR3, AL355922.1, UPK1B, NKG7, IVL, HBQ1, TPSG1 
##     TPSD1, HBG1, OR6M1, ALAS2, KCCAT333, HBB, SPRR2A, NTS, WIF1, PCP4L1 
##     HBA2, LYPD2, ESM1, AGTR2, HBA1, OR51E1, CALCA, TNNC2, MT3, GPHA2 
## Negative:  RPS19, RPL11, RPLP1, RPS8, RPS12, RACK1, RPS13, RPS20, RPL37A, RPL27A 
##     RPS11, RPS24, TMSB10, NAP1L1, RPL15, RPS3, RPS6, RPL14, SEC61G, RPL5 
##     CALU, RPL13, ATP5F1E, LGALS1, RPS28, RPS16, RPL18, CD63, RPL35A, AC016739.1 
## PC_ 2 
## Positive:  RPS15A, PERP, RPL3, RPL26P19, SNHG5, HMGN1, RPL6, RPL21, SNHG6, LRRC75A-AS1 
##     RPL9, C1orf43, RPL22, RPL7A, RPL13A, AC024293.1, GGCT, RPL7L1, AC008026.1, ZFAS1 
##     CSTB, RPL7P9, COX6C, DSG2, SRP9, ADI1, EPCAM, GAS5, TFAP2A, COMMD6 
## Negative:  SPARC, COL1A2, COL3A1, COL1A1, COL5A2, DCN, FBN1, SERPING1, PCOLCE, COL5A1 
##     FKBP7, PAM, AEBP1, CREB3L1, NFIX, BMP1, SERPINF1, CCDC80, OLFML2B, FSTL1 
##     PDGFRA, TGFBR3, IGFBP7, FGFR1, EBF1, TGFBR2, CYR61, TUBA1A, CDH11, C1R 
## PC_ 3 
## Positive:  SAA1, CTSS, SAA3P, PFN1, F13A1, ARG1, SAA2, LYZ, BACH1-IT3, BEND3 
##     AC012618.1, AIF1, ACTG2, CTSC, ECM1, RPS15AP24, RPS26P52, RARS2, FABP5, PRDX1 
##     RPL35P4, CD72, AL162430.1, PF4V1, AL355802.1, C1QB, SEC61G, FABP5P7, MAF, C1QA 
## Negative:  MTND3P17, COL3A1, COL5A2, CTHRC1, LRRC75A-AS1, COL1A1, FNDC1, SNHG5, SDC2, COL1A2 
##     THBS2, PRAME, SPARC, ASPN, RPS15A, HOOK1, COL12A1, GPC3, RPL6, ITM2A 
##     SNHG6, LUM, PRRX1, FSTL1, PERP, EN1, GGCT, UCHL1, RPL26P19, GAS5 
## PC_ 4 
## Positive:  COL12A1, LIMA1, SERPINH1, TIMP3, MTND3P17, THBS2, SPATS2L, CAST, KRT6A, CAMK2N1 
##     IFI44, SLC16A2, IFI27, BST2, COL1A1, KCNMA1, FSTL1, PLS3, TNFSF10, SEMA3C 
##     KLK5, TPM1, GBP1, SAMD9, FHL2, OAS2, OAS3, F3, S100A2, RAB31 
## Negative:  RPS12, RPL37A, RPS8, RPS24, HLA-DRB1, RPS11, RPS19, HLA-DPB1, PRAME, RPL37 
##     RPS13, RPL27A, HLA-DQB1, CD74, AC008481.1, RPL31, CXCR4, RPL11, RPS7, RPS18 
##     RACK1, RPL12, HLA-DRB5, HLA-DRA, HLA-DRB6, HLA-DPA1, UCHL1, HLA-DQA1, LRRC75A-AS1, RPS27A 
## PC_ 5 
## Positive:  EFEMP1, TNXB, C1R, DCN, ACKR3, FCRL2, LGI2, OGN, C3, PLPP3 
##     ADAMTS5, COL14A1, CADM3, LBP, TGFBR2, NOVA1, GSN, CYGB, SLC4A4, TMEFF2 
##     HTRA3, THBS3, CXCL12, C1S, TGFBR3, AC011352.1, MT2P1, KLF4, LAMA2, MT2A 
## Negative:  COL12A1, NCAM1, CDH11, CALD1, COL8A1, PMEPA1, THBS2, TPM2, HS6ST2, ACTA2 
##     AC116917.1, CTGF, POSTN, TNC, GNG11, COL6A3, COL5A2, IGFBP7, LBH, SEMA7A 
##     CNN3, BMP1, THBS1, TPM1, OLFML2B, LOXL3, IGFBP3, COL5A1, TAGLN, ACTN1
DimPlot(filtered_phase_seurat,
        reduction = "pca",
        group.by= "Phase",
        split.by = "Phase")+
  ggtitle("PCA visualization of the cell cycle phases")+
  scale_fill_lancet()

filtered_phase_seurat <- RunUMAP(filtered_phase_seurat,
                             dims = 1:40,
                             reduction = "pca")
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 16:01:51 UMAP embedding parameters a = 0.9922 b = 1.112
## 16:01:51 Read 25592 rows and found 40 numeric columns
## 16:01:51 Using Annoy for neighbor search, n_neighbors = 30
## 16:01:51 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 16:01:56 Writing NN index file to temp file /tmp/RtmpircTMj/fileb65bd7b200354
## 16:01:56 Searching Annoy index using 1 thread, search_k = 3000
## 16:02:04 Annoy recall = 100%
## 16:02:05 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 16:02:07 Initializing from normalized Laplacian + noise (using irlba)
## 16:02:08 Commencing optimization for 200 epochs, with 1289790 positive edges
## 16:02:22 Optimization finished
DimPlot(filtered_phase_seurat,
        reduction = "umap",
        split.by = "Phase",
        group.by = "Sample_Tag") + 
  ggtitle("UMAP splitted by cell cycle phase \n and grouped by conditions") + 
  scale_fill_lancet()

FeaturePlot(filtered_phase_seurat,
            features = "percent.mt")+
   ggtitle("Mitochondrial gene expression distribution in clusters")+
   scale_fill_lancet()

#Integration

integrated_filtered_sct_cond_seurat <- readRDS("~/work/PI3K/data/SeuratObjects/integrated/Human/integrated_filtered_sct_cond_seurat.rds")
p1 <- DimPlot(integrated_filtered_sct_cond_seurat, reduction = "umap", group.by = "Sample_Tag") +
      labs(title = "UMAP visualisation of the condition distribution") +
      scale_fill_lancet()
p2 <- DimPlot(integrated_filtered_sct_cond_seurat, reduction = "umap", label = TRUE, repel = TRUE)+
      labs(title = "UMAP visualisation of the number of clusters") +
      scale_fill_lancet()
p3 <- DimPlot(integrated_filtered_sct_cond_seurat, reduction = "umap", group.by = "Xenograft")+
      labs(title = "UMAP visualisation of the xenograft distribution") +
      scale_fill_lancet()
p1 

p2 

p3 

#Clustering analysis

seurat_integrated <- readRDS("~/work/PI3K/data/SeuratObjects/clusteranalysis/Human/seurat_integrated_newidents.rds")
# Assign identity of clusters
Idents(object = seurat_integrated) <- "integrated_snn_res.1"
# Plot the UMAP
DimPlot(seurat_integrated,
        reduction = "umap",
        label = TRUE,
        label.size = 6) +
        labs(title = "UMAP visualisation shows the number of clusters \n for resolution 1.0") +
        scale_fill_lancet(name="ClusterIDs")

# Assign identity of clusters
Idents(object = seurat_integrated) <- "integrated_snn_res.0.2"
# Plot the UMAP
DimPlot(seurat_integrated,
        reduction = "umap",
        label = TRUE,
        label.size = 6)+
        labs(title = "UMAP visualisation shows the number of clusters \n for the resolution 0.2") +
        scale_fill_lancet(name="ClusterIDs")

# Assign identity of clusters
Idents(object = seurat_integrated) <- "integrated_snn_res.0.4"
# Plot the UMAP
DimPlot(seurat_integrated,
        reduction = "umap",
        label = TRUE,
        label.size = 6)+
        labs(title = "UMAP visualisation shows the number of clusters \n for the resolution 0.4") +
        scale_fill_lancet(name="ClusterIDs")

# Assign identity of clusters
Idents(object = seurat_integrated) <- "integrated_snn_res.0.6"
# Plot the UMAP
DimPlot(seurat_integrated,
        reduction = "umap",
        label = TRUE,
        label.size = 6)+
        labs(title = "UMAP visualisation shows the number of clusters \n for the resolution 0.6") +
        scale_fill_lancet(name="ClusterIDs")

# Assign identity of clusters
Idents(object = seurat_integrated) <- "integrated_snn_res.0.8"
# Plot the UMAP
DimPlot(seurat_integrated,
        reduction = "umap",
        label = TRUE,
        label.size = 6)+
        labs(title = "UMAP visualisation shows the number of clusters \n for the resolution 0.8") +
        scale_fill_lancet(name="ClusterIDs")

# Assign identity of clusters
Idents(object = seurat_integrated) <- "integrated_snn_res.1.4"
# Plot the UMAP
DimPlot(seurat_integrated,
        reduction = "umap",
        label = TRUE,
        label.size = 6)+
        labs(title = "UMAP visualisation shows the number of clusters \n for the resolution 1.4") +
        scale_fill_lancet(name="ClusterIDs")

# Assign identity of clusters
Idents(object = seurat_integrated) <- "integrated_snn_res.1.6"
# Plot the UMAP
DimPlot(seurat_integrated,
        reduction = "umap",
        label = TRUE,
        label.size = 6)+
        labs(title = "UMAP visualisation shows the number of clusters \n for the resolution 1.6") +
        scale_fill_lancet(name="ClusterIDs")

# Assign identity of clusters
Idents(object = seurat_integrated) <- "integrated_snn_res.1.8"
# Plot the UMAP
DimPlot(seurat_integrated,
        reduction = "umap",
        label = TRUE,
        label.size = 6)+
        labs(title = "UMAP visualisation shows the number of clusters \n for the resolution 1.8") +
        scale_fill_lancet(name="ClusterIDs")

# Assign identity of clusters
Idents(object = seurat_integrated) <- "integrated_snn_res.2"
# Plot the UMAP
DimPlot(seurat_integrated,
        reduction = "umap",
        label = TRUE,
        label.size = 6)+
        labs(title = "UMAP visualisation shows the number of clusters \n for the resolution 2.0") +
        scale_fill_lancet(name="ClusterIDs")

# Assign identity of clusters
Idents(object = seurat_integrated) <- "integrated_snn_res.2.4"
# Plot the UMAP
DimPlot(seurat_integrated,
        reduction = "umap",
        label = TRUE,
        label.size = 6)+
        labs(title = "UMAP visualisation shows the number of clusters \n for the resolution 2.4") +
        scale_fill_lancet(name="ClusterIDs")

#Celltype identification singleR

seurat_integrated_renameidents <- readRDS("~/work/PI3K/data/SeuratObjects/celltypeIdentification/Human/SingleR/seurat_integrated_singleR.rds")
DimPlot(object = seurat_integrated_renameidents, 
        reduction = "umap", 
        label = TRUE,
        label.size = 3,
        repel = TRUE) +
        labs(title = "UMAP visualisation shows the different celltypes \n for the resolution 0.4") +
        scale_fill_lancet(name="Celltypes")

n_cells0.4.long <- readRDS("~/work/PI3K/data/SeuratObjects/celltypeIdentification/Human/SingleR/n_cells0.4.rds")
plt_0.4 <- ggplot(n_cells0.4.long,aes(ClusterID, singlr_labels)) +
  geom_tile(aes(fill=Occurances)) +  
  geom_text(aes(label = round(Occurances, 1)), size=2) +
  scale_fill_gradient(low = "white", high = "red") +
  labs(y="Celltypes")

plt_0.4

#Label comparison

df_merged <- readRDS("~/work/PI3K/data/SeuratObjects/celltypeIdentification/df_merged_singleR_labels.rds")

plt_merged <- ggplot(df_merged,aes(x = singlr_labels.human, y= singlr_labels.mouse)) +
  geom_tile(aes(fill=Cell_Index_count)) +
  geom_text(aes(label = Cell_Index_count), size=1.5) +
  scale_fill_gradient(low = "white", high = "red") +
  labs(x="Celltypes_human", y="Celltypes_mouse") +
  theme(axis.text.x = element_text(angle=90))

plt_merged

#Celltype identification puram

seurat_integrated_renamedIdents <- readRDS("~/work/PI3K/data/SeuratObjects/celltypeIdentification/Human/Puram/seurat_integrated_puram.rds")
DimPlot(object = seurat_integrated_renamedIdents, 
        reduction = "umap", 
        label = TRUE,
        label.size = 3,
        repel = TRUE) +
        labs(title = "UMAP visualisation shows the different celltypes \n for the resolution 0.4") +
        scale_fill_lancet(name="Celltypes")

n_cells0.4.long <- readRDS("~/work/PI3K/data/SeuratObjects/celltypeIdentification/Human/Puram/n_cells0.4.rds")
plt_0.4 <- ggplot(n_cells0.4.long,aes(ClusterID, predicted.id)) +
  geom_tile(aes(fill=Occurances)) +  
  geom_text(aes(label = round(Occurances, 1)), size=2) +
  scale_fill_gradient(low = "white", high = "red") +
  labs(y="Celltypes")

plt_0.4

#Transcript analysis

umi_counts_merged <- read_delim("~/work/PI3K/data/SuplementaryFiles/CSVFiles/umi_count_merged.csv")
## Rows: 34609 Columns: 4
## ── Column specification ───────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Cell_Index, Labels
## dbl (2): UMIs_human, UMIs_mouse
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#create scatter plot
sp <- ggplot(umi_counts_merged, aes(x = UMIs_human, y = UMIs_mouse, fill=Labels)) + geom_point(aes(colour = Labels))
sp + geom_abline() +
  xlab("Transcripts human") +
  ylab("Transcripts mouse")

#Celltype identification demultiplexed

seurat_integrated <- readRDS("~/work/PI3K/data/SeuratObjects/cellfiltering_umi/Human/seurat_integrated_with_spec_labels.rds")
seurat_integrated_filtered_human <- readRDS("~/work/PI3K/data/SeuratObjects/demultiplexed/Human/seurat_integrated_filtered_human.rds")
UMAP_human <- DimPlot(seurat_integrated, reduction = "umap", group.by= "Labels", cols = c("red","lightgrey","purple")) + ggtitle("UMAP human cells")
UMAP_human

seurat_integrated_filtered_human <- RunUMAP(seurat_integrated_filtered_human, dims = 1:30)
## 16:04:31 UMAP embedding parameters a = 0.9922 b = 1.112
## 16:04:31 Read 25592 rows and found 30 numeric columns
## 16:04:31 Using Annoy for neighbor search, n_neighbors = 30
## 16:04:31 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 16:04:33 Writing NN index file to temp file /tmp/RtmpircTMj/fileb65bd6ac50948
## 16:04:33 Searching Annoy index using 1 thread, search_k = 3000
## 16:04:40 Annoy recall = 100%
## 16:04:41 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 16:04:43 Initializing from normalized Laplacian + noise (using irlba)
## 16:04:44 Commencing optimization for 200 epochs, with 1140710 positive edges
## 16:04:57 Optimization finished
DimPlot(seurat_integrated_filtered_human, reduction = "umap", group.by = "Sample_Tag") + ggtitle("UMAP human cells by conditions")

metadata <- seurat_integrated@meta.data
metadata %>% select(2,26) -> metadata
table(metadata)
##                     Labels
## Xenograft            mainly_human mainly_mouse unique_human
##   HN10621                      76         3560          628
##   HN10960                       0          160           32
##   HN11097_Control               4         1318          172
##   HN15239A_Alpelisib          106         7351         1815
##   HN15239A_Control             29        10127          214
prop.table(table(metadata))
##                     Labels
## Xenograft            mainly_human mainly_mouse unique_human
##   HN10621            0.0029696780 0.1391059706 0.0245389184
##   HN10960            0.0000000000 0.0062519537 0.0012503907
##   HN11097_Control    0.0001562988 0.0515004689 0.0067208503
##   HN15239A_Alpelisib 0.0041419193 0.2872381994 0.0709206002
##   HN15239A_Control   0.0011331666 0.3957095967 0.0083619881
seurat_integrated_renamedIdents <- readRDS("~/work/PI3K/data/SeuratObjects/celltypeIdentification/Human/Puram/seurat_integrated_puram.rds")
DimPlot(object = seurat_integrated_renamedIdents, 
        reduction = "umap", 
        label = TRUE,
        label.size = 3,
        repel = TRUE) +
        labs(title = "UMAP visualisation shows the different celltypes \n for the resolution 0.4") +
        scale_fill_lancet(name="Celltypes")

n_cells0.4.long <- readRDS("~/work/PI3K/data/SeuratObjects/celltypeIdentification/Human/Puram/n_cells0.4.rds")
plt_0.4 <- ggplot(n_cells0.4.long,aes(ClusterID, predicted.id)) +
  geom_tile(aes(fill=Occurances)) +  
  geom_text(aes(label = round(Occurances, 1)), size=2) +
  scale_fill_gradient(low = "white", high = "red") +
  labs(y="Celltypes")

plt_0.4